{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "42f0a3b3-4b05-48ec-91e8-627389208294",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import json\n",
    "import pickle\n",
    "from tqdm import tqdm\n",
    "\n",
    "#rest of the preprocessing is using the script provided in the TBIP repo - setup/senate_speeches_to_bag_of_words.py \n",
    "import os\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from scipy import sparse\n",
    "from sklearn.feature_extraction.text import CountVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "ff1dc25e-9848-4389-995a-83686b7cea94",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "84063\n"
     ]
    }
   ],
   "source": [
    "raw_documents = list(map(lambda x:x.rstrip(), open(\"clean/raw_documents.txt\").readlines()))\n",
    "print(len(raw_documents))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "faf32525-0377-406e-bc33-37f372b6829c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "84063\n"
     ]
    }
   ],
   "source": [
    "raw_data = pd.read_csv('finalized_tbip_speech_set_raw_original_data_floor_speeches_house.csv')\n",
    "speeches_og = list(raw_data['Text'])\n",
    "print(len(speeches_og))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "5ea2de67-585b-4a2b-9005-ff452da09356",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "59428\n"
     ]
    }
   ],
   "source": [
    "raw_docs_proc_rem = list(map(lambda x:x.rstrip(), open(\"raw_documents_without_procedural.txt\").readlines()))\n",
    "print(len(raw_docs_proc_rem))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "9c3a8fb7-d5c0-40b4-9240-3f899d38c2e8",
   "metadata": {},
   "outputs": [],
   "source": [
    "speeches_og = [document.replace(\"\\n\", ' ').replace(\"\\r\", ' ').rstrip()\n",
    "                 for i, document in enumerate(speeches_og)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "229cb1fd-56e8-44fe-a695-4ccd7f25bbe4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "speeches_og == raw_documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "a0022978-4472-4788-865f-1c137e3afd2b",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "84063it [00:40, 2083.97it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "59428\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "retained_speech_inds = []\n",
    "for i, s in tqdm(enumerate(speeches_og)):\n",
    "    if s in raw_docs_proc_rem:\n",
    "        retained_speech_inds.append(i)\n",
    "print(len(retained_speech_inds))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "9aec5797-14e3-4fbd-ae21-9d995fc2690a",
   "metadata": {},
   "outputs": [],
   "source": [
    "assert len(raw_docs_proc_rem) == len(retained_speech_inds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "269f5fa0-cd37-4f5c-95ff-7b03fab4b11d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 59428 entries, 0 to 84062\n",
      "Data columns (total 5 columns):\n",
      " #   Column               Non-Null Count  Dtype \n",
      "---  ------               --------------  ----- \n",
      " 0   Speaker_Bioguide_ID  59428 non-null  object\n",
      " 1   Speaker_Name         59428 non-null  object\n",
      " 2   Text                 59428 non-null  object\n",
      " 3   Date                 59428 non-null  object\n",
      " 4   Legislative Body     59428 non-null  object\n",
      "dtypes: object(5)\n",
      "memory usage: 2.7+ MB\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "raw_data = raw_data.iloc[retained_speech_inds]\n",
    "print(raw_data.info())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "f71232ee-7234-406f-9c43-8f67d9c9bc42",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "23\n"
     ]
    }
   ],
   "source": [
    "#remove speakers if they gave less than 25 speeches - TBIP paper removed senators with less than 24 speeches.\n",
    "speakers_to_remove_based_on_num_speeches = set()\n",
    "speakers = set(raw_data['Speaker_Bioguide_ID'])\n",
    "thresh = 25\n",
    "for s in speakers:\n",
    "    n_s = len(raw_data[raw_data['Speaker_Bioguide_ID']==s])\n",
    "    if n_s < thresh:\n",
    "        speakers_to_remove_based_on_num_speeches.add(s)\n",
    "print(len(speakers_to_remove_based_on_num_speeches))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "f4ebec62-d4ea-4f0c-a13e-3dde3eafc3f2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 58941 entries, 0 to 84062\n",
      "Data columns (total 5 columns):\n",
      " #   Column               Non-Null Count  Dtype \n",
      "---  ------               --------------  ----- \n",
      " 0   Speaker_Bioguide_ID  58941 non-null  object\n",
      " 1   Speaker_Name         58941 non-null  object\n",
      " 2   Text                 58941 non-null  object\n",
      " 3   Date                 58941 non-null  object\n",
      " 4   Legislative Body     58941 non-null  object\n",
      "dtypes: object(5)\n",
      "memory usage: 2.7+ MB\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "raw_data = raw_data[~raw_data['Speaker_Bioguide_ID'].isin(speakers_to_remove_based_on_num_speeches)]\n",
    "print(raw_data.info())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "77917b45-1e31-40f3-8095-1cc3908bd4a1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1456\n"
     ]
    }
   ],
   "source": [
    "stopwords = open('stopwords.txt').readlines()\n",
    "stopwords = list(map(lambda x:x.rstrip(), stopwords))\n",
    "print(len(stopwords))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "7ca8f185-eb47-4a00-9d5b-4c64b4837bd7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "110\n"
     ]
    }
   ],
   "source": [
    "stopwords_proc = open('procedural_stopwords.txt').readlines()\n",
    "stopwords_proc = list(map(lambda x:x.rstrip(), stopwords_proc))\n",
    "print(len(stopwords_proc))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "5e1250bd-2eb4-4712-ab35-7457575141b5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['mr_speaker_i', 'and_include_extraneous', 'i_ask', 'their_remarks', 'members']"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stopwords_proc[-5:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "ba08cecc-5d5b-4029-942d-4d9f322d6223",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['remarks', '5', 'revise_and_extend', 'to_revise_and', 'mr']"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stopwords_proc[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "beee95c9-77c3-4ff3-a01d-b7765651980f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['remarks', '5', 'revise and extend', 'to revise and', 'mr']"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stopwords_proc = list(map(lambda x:' '.join(x.split('_')), stopwords_proc))\n",
    "stopwords_proc[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "a096bb87-1d1a-49eb-8e07-66127e7b0da8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1527\n"
     ]
    }
   ],
   "source": [
    "stopwords = list(set(stopwords).union(set(stopwords_proc)))\n",
    "print(len(stopwords))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "f5bb5559-bfd1-4b4f-a788-0429959a8d1a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "58941\n",
      "58941\n"
     ]
    }
   ],
   "source": [
    "speakers = list(raw_data['Speaker_Bioguide_ID'])\n",
    "print(len(speakers))\n",
    "speeches = list(raw_data['Text'])\n",
    "print(len(speeches))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "07c6d492-8bf3-427a-a34e-da266eaf22ad",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "58941\n",
      "490\n"
     ]
    }
   ],
   "source": [
    "speaker_to_speaker_id = dict(\n",
    "    [(y, x) for x, y in enumerate(sorted(set(speakers)))])\n",
    "author_indices = np.array(\n",
    "    [speaker_to_speaker_id[s] for s in speakers])\n",
    "print(len(author_indices))\n",
    "author_map = np.array(list(speaker_to_speaker_id.keys()))\n",
    "print(len(author_map))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "b1abcfaf-1cc4-494e-96d1-000ebff0b2c6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "58941\n"
     ]
    }
   ],
   "source": [
    "print(len(speeches))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "e1b9028e-5c4d-49c9-b3a2-ce7b5b145bf1",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/pranavgoel/miniconda3/envs/pg3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py:409: UserWarning: Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['allard', 'andr', 'balart', 'barrag', 'col', 'colon', 'cortez', 'garc', 'gonz', 'guti', 'halleran', 'jes', 'jos', 'jr', 'legislative', 'lehtinen', 'lez', 'luj', 'mucarsel', 'nchez', 'ocasio', 'powell', 'ra', 'rdenas', 'ros', 'rourke', 'roybal', 'rrez', 'shea', 'sr', 'vel', 'wm', 'zquez'] not in stop_words.\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "count_vectorizer = CountVectorizer(min_df=0.001,\n",
    "                                   max_df=0.75, \n",
    "                                   stop_words=stopwords, \n",
    "                                   ngram_range=(1, 3),\n",
    "                                   token_pattern=\"[a-zA-Z]+\")\n",
    "# Learn initial document term matrix. This is only initial because we use it to\n",
    "# identify words to exclude based on author counts.\n",
    "counts = count_vectorizer.fit_transform(speeches)\n",
    "vocabulary = np.array(\n",
    "    [k for (k, v) in sorted(count_vectorizer.vocabulary_.items(), \n",
    "                            key=lambda kv: kv[1])])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "22ea83fe-a347-4831-a2a6-773ed6488f76",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(58941, 14090)\n",
      "14090\n"
     ]
    }
   ],
   "source": [
    "print(counts.shape)\n",
    "print(len(vocabulary))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "6b15d430-98d6-4984-a288-ec6735434d80",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 490/490 [00:03<00:00, 129.13it/s]\n"
     ]
    }
   ],
   "source": [
    "author_to_inds = {}\n",
    "for a in tqdm(list(author_map)):\n",
    "    inds = []\n",
    "    author_ind = speaker_to_speaker_id[a]\n",
    "    for i, ind in enumerate(list(author_indices)):\n",
    "        if ind==author_ind:\n",
    "            inds.append(i)\n",
    "    author_to_inds[a] = inds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "63c0dcc0-7310-4629-b46f-6b25fefad7c0",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_per_author_counts(counts, author_to_inds):\n",
    "    list_of_arrays = []\n",
    "    for a in author_to_inds:\n",
    "        inds = author_to_inds[a]\n",
    "        list_of_arrays.append(np.array(np.sum(counts[inds], 0)))#.reshape((1, counts.shape[1])))\n",
    "    return np.concatenate(list_of_arrays, 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "5d3f6a6a-583e-40b7-8fbc-001b2eaf4e9c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(490, 14090)\n"
     ]
    }
   ],
   "source": [
    "# Remove phrases spoken by less than 50 representatives\n",
    "min_authors_per_word = 50\n",
    "counts_per_author = get_per_author_counts(counts, author_to_inds)\n",
    "print(counts_per_author.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "7dd8dd78-10f3-4bb7-b3d0-272ab12a804e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "11433\n"
     ]
    }
   ],
   "source": [
    "acceptable_words = []\n",
    "for i in range(len(vocabulary)):\n",
    "    if np.count_nonzero(counts_per_author[:, i]) >= min_authors_per_word:\n",
    "        acceptable_words.append(i)\n",
    "print(len(acceptable_words))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "0b8f5230-1e9a-4bea-bec1-b9146a4499e2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(58941, 11433)\n",
      "11433\n"
     ]
    }
   ],
   "source": [
    "count_vectorizer = CountVectorizer(ngram_range=(1, 3),\n",
    "                                   vocabulary=vocabulary[acceptable_words])\n",
    "counts = count_vectorizer.fit_transform(speeches)\n",
    "vocabulary = np.array(\n",
    "    [k for (k, v) in sorted(count_vectorizer.vocabulary_.items(), \n",
    "                            key=lambda kv: kv[1])])\n",
    "print(counts.shape)\n",
    "print(len(vocabulary))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "b021e7f3-0df9-447a-a07c-a2a6229b7c94",
   "metadata": {},
   "outputs": [],
   "source": [
    "# `n_gram_to_unigram` takes as key an index to an n-gram in the vocabulary\n",
    "# and its value is a list of the vocabulary indices of the corresponding \n",
    "# unigrams.\n",
    "n_gram_indices = np.where(\n",
    "  np.array([len(word.split(' ')) for word in vocabulary]) > 1)[0]\n",
    "n_gram_to_unigrams = {}\n",
    "for n_gram_index in n_gram_indices:\n",
    "    matching_unigrams = []\n",
    "    for unigram in vocabulary[n_gram_index].split(' '):\n",
    "        if unigram in vocabulary:\n",
    "            matching_unigrams.append(np.where(vocabulary == unigram)[0][0])\n",
    "    n_gram_to_unigrams[n_gram_index] = matching_unigrams\n",
    "\n",
    "# `n_grams_to_bigrams` now breaks apart trigrams and higher to find bigrams \n",
    "# as subsets of these words.\n",
    "n_grams_to_bigrams = {}\n",
    "for n_gram_index in n_gram_indices:\n",
    "    split_n_gram = vocabulary[n_gram_index].split(' ')\n",
    "    n_gram_length = len(split_n_gram) \n",
    "    if n_gram_length > 2:\n",
    "        bigram_matches = []\n",
    "        for i in range(0, n_gram_length - 1):\n",
    "            bigram = \" \".join(split_n_gram[i:(i + 2)])\n",
    "            if bigram in vocabulary:\n",
    "                bigram_matches.append(np.where(vocabulary == bigram)[0][0])\n",
    "        n_grams_to_bigrams[n_gram_index] = bigram_matches"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "7dd90905-8ebb-463b-a1fa-75a99196d121",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 58941/58941 [01:34<00:00, 620.92it/s] \n"
     ]
    }
   ],
   "source": [
    "# Go through counts, and remove a unigram each time a bigram superset \n",
    "# appears. Also remove a bigram each time a trigram superset appears.\n",
    "# Note this isn't perfect: if bigrams overlap (e.g. \"global health care\" \n",
    "# contains \"global health\" and \"health care\"), we count them both. This\n",
    "# may introduce a problem where we subract a unigram count twice, so we also\n",
    "# ensure non-negativity.\n",
    "#counts_dense = counts.toarray()\n",
    "for i in tqdm(range(counts.shape[0])):\n",
    "    n_grams_in_doc = np.where(counts[i, n_gram_indices].toarray() > 0)[0]\n",
    "    sub_n_grams = n_gram_indices[n_grams_in_doc]\n",
    "    for n_gram in sub_n_grams:\n",
    "        counts[i, n_gram_to_unigrams[n_gram]] = sparse.csr_matrix(counts[i, n_gram_to_unigrams[n_gram]].toarray() - counts[i, n_gram])\n",
    "        if n_gram in n_grams_to_bigrams:\n",
    "            counts[i, n_grams_to_bigrams[n_gram]] = sparse.csr_matrix(counts[i, n_grams_to_bigrams[n_gram]].toarray() - counts[i, n_gram])\n",
    "counts[counts < 0] = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "02b80c89-bc5b-4374-854e-86cf001711bb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(58941, 11433)\n"
     ]
    }
   ],
   "source": [
    "print(counts.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "c796c69b-86cd-4686-adae-7270095af5d9",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 58941/58941 [00:03<00:00, 17853.80it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(58932, 11433)\n",
      "(58932,)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# Remove speeches with no words.\n",
    "existing_speeches = []#np.where(np.sum(counts_dense, axis=1) > 0)[0]\n",
    "for i in tqdm(range(counts.shape[0])):\n",
    "    if counts[i].sum() > 0:\n",
    "        existing_speeches.append(i)\n",
    "counts = counts[existing_speeches]\n",
    "print(counts.shape)\n",
    "author_indices = author_indices[existing_speeches]\n",
    "print(author_indices.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "2f33b56a-0d52-4826-b75b-a89f0236e744",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save data.\n",
    "\n",
    "# `counts.npz` is a [num_documents, num_words] sparse matrix containing the\n",
    "# word counts for each document.\n",
    "sparse.save_npz(\"clean_removing_procedural/counts.npz\",\n",
    "                counts.astype(np.float32))\n",
    "\n",
    "# `author_indices.npy` is a [num_documents] vector where each entry is an\n",
    "# integer indicating the author of the corresponding document.\n",
    "np.save(\"clean_removing_procedural/author_indices.npy\", author_indices)\n",
    "\n",
    "# `vocabulary.txt` is a [num_words] vector where each entry is a string\n",
    "# denoting the corresponding word in the vocabulary.\n",
    "np.savetxt(\"clean_removing_procedural/vocabulary.txt\", vocabulary, fmt=\"%s\")\n",
    "\n",
    "# `author_map.txt` is a [num_authors] vector of strings providing the bioguide ID of\n",
    "# each author in the corpus.\n",
    "np.savetxt(\"clean_removing_procedural/author_map.txt\", author_map, fmt=\"%s\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "7a378a1c-cc77-48ce-86b8-f537fabd21ac",
   "metadata": {},
   "outputs": [],
   "source": [
    "# `raw_documents.txt` contains all the documents we ended up using.\n",
    "raw_documents = [document.replace(\"\\n\", ' ').replace(\"\\r\", ' ') \n",
    "                 for i, document in enumerate(speeches) if i in existing_speeches]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "b9279f15-faeb-4b79-89b4-0f2f7e3cc4de",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "58932\n"
     ]
    }
   ],
   "source": [
    "print(len(raw_documents))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "5b162fac-bc85-400e-a68c-f4d1a149d7a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "f = open('clean_removing_procedural/raw_documents.txt', 'w')\n",
    "for i, doc in enumerate(raw_documents):\n",
    "    f.write(doc)\n",
    "    if i < len(raw_documents) - 1:\n",
    "        f.write('\\n')\n",
    "f.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "eb529836-e767-4bbb-819d-f89609418992",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 58932 entries, 0 to 84062\n",
      "Data columns (total 5 columns):\n",
      " #   Column               Non-Null Count  Dtype \n",
      "---  ------               --------------  ----- \n",
      " 0   Speaker_Bioguide_ID  58932 non-null  object\n",
      " 1   Speaker_Name         58932 non-null  object\n",
      " 2   Text                 58932 non-null  object\n",
      " 3   Date                 58932 non-null  object\n",
      " 4   Legislative Body     58932 non-null  object\n",
      "dtypes: object(5)\n",
      "memory usage: 2.7+ MB\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "raw_data = raw_data.iloc[existing_speeches]\n",
    "print(raw_data.info())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "5672728b-ea8c-4cde-8eea-bf9c5709a27f",
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_data.to_csv('finalized_tbip_speech_set_raw_original_data_floor_speeches_house_after_removing_procedural_speeches.csv',\n",
    "                index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8ca4bd76",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:pg3] *",
   "language": "python",
   "name": "conda-env-pg3-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
